#Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
#Read data from CSV file
nf = pd.read_csv('netflix_titles.csv')
#Original data with first 5 rows
nf.head(5)
| show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | September 25, 2021 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
| 1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
| 2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
| 3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | September 24, 2021 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
| 4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | September 24, 2021 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
#Basic information about the dataset
nf.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 7976 non-null object 6 date_added 8797 non-null object 7 release_year 8807 non-null int64 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: int64(1), object(11) memory usage: 825.8+ KB
print("There are {} rows and {} columns in the dataset.".format(nf.shape[0],nf.shape[1]))
There are 8807 rows and 12 columns in the dataset.
nf['type'].value_counts()
Movie 6131 TV Show 2676 Name: type, dtype: int64
#Create new column with the year each title was added
# NOTE(review): str.strip() guards against whitespace-padded dates, which the
# strict format inference in pandas >= 2.0 rejects - confirm the CSV has such rows.
# Missing dates stay missing (NaT), so year_added is NaN for them.
nf['date_added'] = pd.to_datetime(nf['date_added'].str.strip())
nf['year_added'] = nf['date_added'].dt.year
#Store release_year as integers under the name year_release
nf['year_release'] = nf['release_year'].dropna().astype('int64')
#Drop 'date_added' and 'release_year' because we got alternatives
nf.drop(['date_added', 'release_year'], axis=1, inplace=True)
#Unique values in rating column
nf['rating'].unique()
array(['PG-13', 'TV-MA', 'PG', 'TV-14', 'TV-PG', 'TV-Y', 'TV-Y7', 'R',
'TV-G', 'G', 'NC-17', '74 min', '84 min', '66 min', 'NR', nan,
'TV-Y7-FV', 'UR'], dtype=object)
#Function to count the number of contents having "minutes" as duration
#1: min
#0: seasons
def count(duration):
    """Return 1 when *duration* is expressed in minutes, otherwise 0.

    Parameters
    ----------
    duration : object
        Raw value of the ``duration`` column, e.g. ``"90 min"`` or
        ``"2 Seasons"``.

    Returns
    -------
    int
        1 if the text contains ``'min'``; 0 for season-based durations and
        for non-string values such as NaN or None, so the function no longer
        raises TypeError on missing data.
    """
    return int(isinstance(duration, str) and 'min' in duration)
#Flag every known duration: 1 = minutes, 0 = seasons (missing durations are skipped)
known_durations = nf['duration'].dropna()
dur = known_durations.apply(count)
dur.value_counts()
1 6128 0 2676 Name: duration, dtype: int64
#Number of seasons per title: the leading integer of e.g. "2 Seasons", 0 for
#titles measured in minutes. Parsing the first token (instead of slicing two
#characters) avoids trailing blanks and works for any number of digits.
#Rows with a missing duration stay NaN because dropna() removes them before apply().
nf['Season'] = nf['duration'].dropna().apply(lambda x: int(x.split()[0]) if 'Season' in x else 0)
nf['Season'].isna().sum()
3
#Keep only the number of minutes (text before the trailing " min");
#durations that are not given in minutes become 0
has_duration = nf['duration'].dropna()
nf['duration'] = has_duration.apply(lambda text: text[:-4] if 'min' in text else 0)
nf['duration'].unique()
array(['90', 0, '91', '125', '104', '127', '67', '94', '161', '61', '166',
'147', '103', '97', '106', '111', '110', '105', '96', '124', '116',
'98', '23', '115', '122', '99', '88', '100', '102', '93', '95',
'85', '83', '113', '13', '182', '48', '145', '87', '92', '80',
'117', '128', '119', '143', '114', '118', '108', '63', '121',
'142', '154', '120', '82', '109', '101', '86', '229', '76', '89',
'156', '112', '107', '129', '135', '136', '165', '150', '133',
'70', '84', '140', '78', '64', '59', '139', '69', '148', '189',
'141', '130', '138', '81', '132', '123', '65', '68', '66', '62',
'74', '131', '39', '46', '38', '126', '155', '159', '137', '12',
'273', '36', '34', '77', '60', '49', '58', '72', '204', '212',
'25', '73', '29', '47', '32', '35', '71', '149', '33', '15', '54',
'224', '162', '37', '75', '79', '55', '158', '164', '173', '181',
'185', '21', '24', '51', '151', '42', '22', '134', '177', '52',
'14', '53', '8', '57', '28', '50', '9', '26', '45', '171', '27',
'44', '146', '20', '157', '17', '203', '41', '30', '194', '233',
'237', '230', '195', '253', '152', '190', '160', '208', '180',
'144', '5', '174', '170', '192', '209', '187', '172', '16', '186',
'11', '193', '176', '56', '169', '40', '10', '3', '168', '312',
'153', '214', '31', '163', '19', nan, '179', '43', '200', '196',
'167', '178', '228', '18', '205', '201', '191'], dtype=object)
#Convert duration from object (strings mixed with 0) to a numeric dtype
#A duration of 0 marks titles whose length was not given in minutes
nf['duration'] = pd.to_numeric(nf['duration'])
nf['duration'].value_counts()
0.0 2676
90.0 152
97.0 146
94.0 146
93.0 146
...
212.0 1
8.0 1
186.0 1
193.0 1
191.0 1
Name: duration, Length: 206, dtype: int64
#Convert Season to a numeric dtype
#A Season of 0 marks titles whose duration was not given in seasons
nf['Season'] = pd.to_numeric(nf['Season'])
nf['Season'].value_counts()
0.0 6128 1.0 1793 2.0 425 3.0 199 4.0 95 5.0 65 6.0 33 7.0 23 8.0 17 9.0 9 10.0 7 13.0 3 15.0 2 12.0 2 11.0 2 17.0 1 Name: Season, dtype: int64
#Overall missing data from each column
nf.isnull().sum()
show_id 0 type 0 title 0 director 2634 cast 825 country 831 rating 4 duration 3 listed_in 0 description 0 year_added 10 year_release 0 Season 3 dtype: int64
#Missing data in graph: one cell per value, highlighted where it is null
missing_mask = nf.isnull()
plt.figure(figsize=(10, 5))
sns.heatmap(missing_mask)
plt.show()
#Share of missing values per column, in percent
missing_mask.sum() / len(nf) * 100
show_id 0.000000 type 0.000000 title 0.000000 director 29.908028 cast 9.367549 country 9.435676 rating 0.045418 duration 0.034064 listed_in 0.000000 description 0.000000 year_added 0.113546 year_release 0.000000 Season 0.034064 dtype: float64
#Remove the director, cast and description columns
nf.drop(['director', 'cast', 'description'], axis=1, inplace=True)
# Filling all the missing values in the 'country' column with United States
#as Netflix was created in the USA and every show is aired on Netflix US.
#The result is assigned back to the column: an inplace replace on the
#nf['country'] selection may act on a temporary copy and leave nf unchanged.
nf['country'] = nf['country'].fillna('United States')
# Dropna to drop all other missing data as it only accounts for 0.1% of the dataset
nf.dropna(inplace=True)
#Double check missing data table
nf.isnull().sum()/len(nf)*100
show_id 0.0 type 0.0 title 0.0 country 0.0 rating 0.0 duration 0.0 listed_in 0.0 year_added 0.0 year_release 0.0 Season 0.0 dtype: float64
#Review data after cleaning process
nf.head()
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | United States | PG-13 | 90.0 | Documentaries | 2021.0 | 2020 | 0.0 |
| 1 | s2 | TV Show | Blood & Water | South Africa | TV-MA | 0.0 | International TV Shows, TV Dramas, TV Mysteries | 2021.0 | 2021 | 2.0 |
| 2 | s3 | TV Show | Ganglands | United States | TV-MA | 0.0 | Crime TV Shows, International TV Shows, TV Act... | 2021.0 | 2021 | 1.0 |
| 3 | s4 | TV Show | Jailbirds New Orleans | United States | TV-MA | 0.0 | Docuseries, Reality TV | 2021.0 | 2021 | 1.0 |
| 4 | s5 | TV Show | Kota Factory | India | TV-MA | 0.0 | International TV Shows, Romantic TV Shows, TV ... | 2021.0 | 2021 | 2.0 |
# Consider 2 types: share of each value of the 'type' column
type_counts = nf['type'].value_counts()
plt.figure(figsize=(10, 5))
plt.pie(
    x=type_counts,
    labels=type_counts.index,
    explode=[0.02, 0],
    autopct='%1.2f%%',
)
plt.title('Types of Content', fontsize=12, fontweight='bold')
plt.show()
#Explore country column
nf['country'].value_counts()
United States 3638
India 972
United Kingdom 418
Japan 243
South Korea 199
...
Romania, Bulgaria, Hungary 1
Uruguay, Guatemala 1
France, Senegal, Belgium 1
Mexico, United States, Spain, Colombia 1
United Arab Emirates, Jordan 1
Name: country, Length: 748, dtype: int64
from collections import Counter
country_data = nf['country']
#A title can list several countries: join every entry into one string,
#normalise the separators to a bare comma, then split into single countries
joined_countries = ','.join(country_data)
normalised_countries = joined_countries.replace(' ,', ',').replace(', ', ',')
country = normalised_countries.split(',')
#Occurrences of each individual country
count = Counter(country)
country_count = pd.Series(dict(count)).sort_values(ascending=False)
#TOP 10 Countries
top10_country = country_count.head(10)
top10_country
United States 4510 India 1046 United Kingdom 805 Canada 445 France 393 Japan 316 Spain 232 South Korea 231 Germany 226 Mexico 169 dtype: int64
#Graph of top 10 countries
#x holds the country names and y the number of titles per country
x = top10_country.index
y = top10_country
from matplotlib import gridspec
fig = plt.figure(figsize=(20, 6))
#One row with two panels; the left panel is twice as wide as the right one
gs = gridspec.GridSpec(nrows=1, ncols=2, height_ratios=[6], width_ratios=[10, 5])
#Left panel: bar chart of the counts
axes1 = plt.subplot(gs[0])
sns.barplot(x=x, y=y, ax=axes1, palette="RdGy")
axes1.set_xticklabels(x)
axes1.set_title('Top 10 countries', fontsize=15, fontweight='bold')
#Right panel: the same counts as shares of the top-10 total
axes2 = plt.subplot(gs[1])
axes2.pie(y, labels=x, shadow=True, colors=sns.color_palette("RdGy", n_colors=20),
autopct='%1.2f%%')
#Equal axis scaling keeps the pie circular
axes2.axis('equal')
plt.show()
#Keep the rows whose country is exactly one of the top 10 countries.
#The names come from top10_country instead of a hand-written list, so the
#filter always matches the ranking computed above.
#NOTE(review): rows listing several countries (e.g. "Germany, Czech Republic")
#are not selected although they were counted in the ranking - confirm intent.
top_10 = nf[nf['country'].isin(top10_country.index)]
top_10.head()
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | United States | PG-13 | 90.0 | Documentaries | 2021.0 | 2020 | 0.0 |
| 2 | s3 | TV Show | Ganglands | United States | TV-MA | 0.0 | Crime TV Shows, International TV Shows, TV Act... | 2021.0 | 2021 | 1.0 |
| 3 | s4 | TV Show | Jailbirds New Orleans | United States | TV-MA | 0.0 | Docuseries, Reality TV | 2021.0 | 2021 | 1.0 |
| 4 | s5 | TV Show | Kota Factory | India | TV-MA | 0.0 | International TV Shows, Romantic TV Shows, TV ... | 2021.0 | 2021 | 2.0 |
| 5 | s6 | TV Show | Midnight Mass | United States | TV-MA | 0.0 | TV Dramas, TV Horror, TV Mysteries | 2021.0 | 2021 | 1.0 |
#Graph comparing the number of Movies and TV Shows for each of the top 10 countries
plt.figure(figsize=(15,6))
#Bars follow the ranking order stored in top10_country
sns.countplot(x='country',hue='type',data=top_10, order=top10_country.index, palette="rocket")
plt.title('Comparison between 2 Types of Top 10 Countries',fontweight='bold')
plt.show()
#Create 2 new DFs to separate TV Shows and Movies
nf_tv = nf[nf['type']=='TV Show']
nf_movie = nf[nf['type']=='Movie']
# nf_movie.head()
#Number of titles added in each year
nf['year_added'].value_counts()
2019.0 2016 2020.0 1879 2018.0 1648 2021.0 1498 2017.0 1185 2016.0 426 2015.0 82 2014.0 24 2011.0 13 2013.0 11 2012.0 3 2009.0 2 2008.0 2 2010.0 1 Name: year_added, dtype: int64
#Create a (year_added, count) table from the yearly totals, ordered by year.
#rename_axis + reset_index(name=...) gives the same column names on every
#pandas version; renaming 'index'/'year_added' after reset_index() produces
#two 'count' columns on pandas >= 2.0, where value_counts() is named 'count'.
nf_content = (
    nf['year_added']
    .value_counts()
    .rename_axis('year_added')
    .reset_index(name='count')
    .sort_values('year_added')
)
#Add percent column to get a better view of how much each year accounts for
nf_content['percent'] = 100 * nf_content['count'] / nf_content['count'].sum()
nf_content
| year_added | count | percent | |
|---|---|---|---|
| 12 | 2008.0 | 2 | 0.022753 |
| 11 | 2009.0 | 2 | 0.022753 |
| 13 | 2010.0 | 1 | 0.011377 |
| 8 | 2011.0 | 13 | 0.147895 |
| 10 | 2012.0 | 3 | 0.034130 |
| 9 | 2013.0 | 11 | 0.125142 |
| 7 | 2014.0 | 24 | 0.273038 |
| 6 | 2015.0 | 82 | 0.932878 |
| 5 | 2016.0 | 426 | 4.846416 |
| 4 | 2017.0 | 1185 | 13.481229 |
| 2 | 2018.0 | 1648 | 18.748578 |
| 0 | 2019.0 | 2016 | 22.935154 |
| 1 | 2020.0 | 1879 | 21.376564 |
| 3 | 2021.0 | 1498 | 17.042093 |
#Movies added per year as a (year_added, count) table, ordered by year.
#rename_axis + reset_index(name=...) gives the same column names on every
#pandas version (value_counts() is named 'count' from pandas 2.0 on).
movie = (
    nf_movie['year_added']
    .value_counts()
    .rename_axis('year_added')
    .reset_index(name='count')
    .sort_values('year_added')
)
#Share of each year among all movies, in percent
movie['percent'] = 100 * movie['count'] / movie['count'].sum()
movie
| year_added | count | percent | |
|---|---|---|---|
| 12 | 2008.0 | 1 | 0.016324 |
| 11 | 2009.0 | 2 | 0.032648 |
| 13 | 2010.0 | 1 | 0.016324 |
| 8 | 2011.0 | 13 | 0.212210 |
| 10 | 2012.0 | 3 | 0.048972 |
| 9 | 2013.0 | 6 | 0.097943 |
| 7 | 2014.0 | 19 | 0.310153 |
| 6 | 2015.0 | 56 | 0.914136 |
| 5 | 2016.0 | 251 | 4.097290 |
| 4 | 2017.0 | 836 | 13.646752 |
| 2 | 2018.0 | 1237 | 20.192622 |
| 0 | 2019.0 | 1424 | 23.245184 |
| 1 | 2020.0 | 1284 | 20.959843 |
| 3 | 2021.0 | 993 | 16.209598 |
#TV shows added per year as a (year_added, count) table, ordered by year.
#rename_axis + reset_index(name=...) gives the same column names on every
#pandas version (value_counts() is named 'count' from pandas 2.0 on).
tv = (
    nf_tv['year_added']
    .value_counts()
    .rename_axis('year_added')
    .reset_index(name='count')
    .sort_values('year_added')
)
#Share of each year among all TV shows, in percent
tv['percent'] = 100 * tv['count'] / tv['count'].sum()
tv
| year_added | count | percent | |
|---|---|---|---|
| 9 | 2008.0 | 1 | 0.037538 |
| 8 | 2013.0 | 5 | 0.187688 |
| 7 | 2014.0 | 5 | 0.187688 |
| 6 | 2015.0 | 26 | 0.975976 |
| 5 | 2016.0 | 175 | 6.569069 |
| 4 | 2017.0 | 349 | 13.100601 |
| 3 | 2018.0 | 411 | 15.427928 |
| 1 | 2019.0 | 592 | 22.222222 |
| 0 | 2020.0 | 595 | 22.334835 |
| 2 | 2021.0 | 505 | 18.956456 |
#Create interactive graph to visualize the numbers
import plotly.graph_objects as go
#One line per table: movies, TV shows and all titles, each plotted as count per year added
p1 = go.Scatter(x=movie['year_added'], y=movie["count"], name="Movies", marker=dict(color="#a678de"))
p2 = go.Scatter(x=tv['year_added'], y=tv["count"], name="TV Shows", marker=dict(color="#6ad49b"))
p3 = go.Scatter(x=nf_content['year_added'], y=nf_content["count"], name="Total Contents", marker=dict(color="brown"))
#Centered title and a horizontal legend
layout = go.Layout(title="<b>Content Added over Years", title_x=0.5,
font=dict(family="Arial",size=10, color='black'),
legend=dict(x=0.1, y=1.1, orientation="h"))
fig = go.Figure(data=[p1,p2,p3], layout=layout)
fig.show()
#Static chart: titles per release year, split by type, for releases after 2013
plt.figure(figsize=(15,4))
sns.countplot(x='year_release', hue='type', data=nf[nf['year_release']>2013])
#Anchor the legend to the right of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=90)
plt.title('Contents Released over Years',fontweight='bold')
plt.show()
# Understanding Netflix rating based on ages
#Ratings listed per audience group, then inverted into a rating -> group lookup
age_groups = {
    'Kids': ['TV-PG', 'TV-Y7-FV', 'TV-Y7', 'TV-Y', 'TV-G', 'PG', 'G'],
    'Teens': ['TV-14', 'PG-13'],
    'Adults': ['TV-MA', 'R', 'NR', 'UR', 'NC-17'],
}
ages = {
    rating: group
    for group, ratings in age_groups.items()
    for rating in ratings
}
#Replace each existing rating with its easier audience term
nf['ages'] = nf['rating'].replace(ages)
nf['ages']
0 Teens
1 Adults
2 Adults
3 Adults
4 Adults
...
8802 Adults
8803 Kids
8804 Adults
8805 Kids
8806 Teens
Name: ages, Length: 8790, dtype: object
# Number of show_id(s) for every (rating, ages) pair, stored in a 'counts'
# column and ordered by ages
rating_nf = (
    nf.groupby(['rating', 'ages'])['show_id']
    .count()
    .reset_index(name='counts')
    .sort_values('ages')
)
rating_nf
| rating | ages | counts | |
|---|---|---|---|
| 1 | NC-17 | Adults | 3 |
| 2 | NR | Adults | 79 |
| 5 | R | Adults | 799 |
| 8 | TV-MA | Adults | 3205 |
| 13 | UR | Adults | 3 |
| 0 | G | Kids | 41 |
| 3 | PG | Kids | 287 |
| 7 | TV-G | Kids | 220 |
| 9 | TV-PG | Kids | 861 |
| 10 | TV-Y | Kids | 306 |
| 11 | TV-Y7 | Kids | 333 |
| 12 | TV-Y7-FV | Kids | 6 |
| 4 | PG-13 | Teens | 490 |
| 6 | TV-14 | Teens | 2157 |
#List of Netflix ratings in x-axis order: the Kids ratings first, then Teens, then Adults
group_ages = ["G", "TV-G", "TV-Y", "PG", "TV-PG", "TV-Y7", "TV-Y7-FV",
"PG-13", "TV-14", 'NC-17', "NR", "R", "TV-MA", "UR"]
#Graph showing the distribution of Netflix ratings, colored by audience group
plt.figure(figsize=(12,4))
#NOTE(review): 'whitegrid' is a seaborn style name; plt.grid presumably only reads it as a truthy "show grid" flag - confirm whether sns.set_style was intended
plt.grid('whitegrid')
#dodge=False draws a single bar per rating instead of one slot per hue level
sns.barplot(x='rating', y='counts', data=rating_nf, hue='ages', order=group_ages, palette="viridis", dodge=False)
plt.title("Distribution of Ratings", fontweight='bold')
Text(0.5, 1.0, 'Distribution of Ratings')
#Recreate the 2 dfs so that they include the new ages column.
#.copy() makes them independent frames: count() later adds a 'genre' column
#to them, which raises SettingWithCopyWarning when they are slices of nf.
nf_tv = nf[nf['type']=='TV Show'].copy()
nf_movie = nf[nf['type']=='Movie'].copy()
nf_movie.head()
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | ages | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | United States | PG-13 | 90.0 | Documentaries | 2021.0 | 2020 | 0.0 | Teens |
| 6 | s7 | Movie | My Little Pony: A New Generation | United States | PG | 91.0 | Children & Family Movies | 2021.0 | 2021 | 0.0 | Kids |
| 7 | s8 | Movie | Sankofa | United States, Ghana, Burkina Faso, United Kin... | TV-MA | 125.0 | Dramas, Independent Movies, International Movies | 2021.0 | 1993 | 0.0 | Adults |
| 9 | s10 | Movie | The Starling | United States | PG-13 | 104.0 | Comedies, Dramas | 2021.0 | 2021 | 0.0 | Teens |
| 12 | s13 | Movie | Je Suis Karl | Germany, Czech Republic | TV-MA | 127.0 | Dramas, International Movies | 2021.0 | 2021 | 0.0 | Adults |
#DF with the number of movies (show_id) in each ages group, ordered by ages
movie_rating = (
    nf_movie.groupby(['ages'])['show_id']
    .count()
    .reset_index(name='count')
    .sort_values('ages')
)
movie_rating
| ages | count | |
|---|---|---|
| 0 | Adults | 2940 |
| 1 | Kids | 1269 |
| 2 | Teens | 1917 |
#DF with the number of TV shows (show_id) in each ages group, ordered by ages
tv_rating = (
    nf_tv.groupby(['ages'])['show_id']
    .count()
    .reset_index(name='count')
    .sort_values('ages')
)
tv_rating
| ages | count | |
|---|---|---|
| 0 | Adults | 1149 |
| 1 | Kids | 785 |
| 2 | Teens | 730 |
#Graph showing the share of each ages group per type, side by side for comparison
fig,axes = plt.subplots(nrows=1,ncols=2, figsize=(15,8))
#Left pie: movies
axes[0].pie(x=movie_rating['count'], labels=movie_rating['ages'], autopct='%1.2f%%')
#y=-0.01 places the title below the pie instead of above it
axes[0].set_title('Distribution of Movies Rating', fontweight="bold", y=-0.01)
#Right pie: TV shows
axes[1].pie(x=tv_rating['count'], labels=tv_rating['ages'], autopct='%1.2f%%')
axes[1].set_title('Distribution of TV Shows Rating', fontweight="bold", y=-0.01)
Text(0.5, -0.01, 'Distribution of TV Shows Rating')
#DF of titles with a duration in minutes (duration is 0 for titles measured in seasons)
duration_movie = nf[nf['duration'] != 0]
duration_movie.head(2)
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | ages | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | s1 | Movie | Dick Johnson Is Dead | United States | PG-13 | 90.0 | Documentaries | 2021.0 | 2020 | 0.0 | Teens |
| 6 | s7 | Movie | My Little Pony: A New Generation | United States | PG | 91.0 | Children & Family Movies | 2021.0 | 2021 | 0.0 | Kids |
duration_movie[duration_movie["Season"] == 1].info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 0 entries Data columns (total 11 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 0 non-null object 1 type 0 non-null object 2 title 0 non-null object 3 country 0 non-null object 4 rating 0 non-null object 5 duration 0 non-null float64 6 listed_in 0 non-null object 7 year_added 0 non-null float64 8 year_release 0 non-null int64 9 Season 0 non-null float64 10 ages 0 non-null object dtypes: float64(3), int64(1), object(7) memory usage: 0.0+ bytes
#Graph showing the distribution of movie length in minutes, colored by ages group
plt.figure(figsize=(12,4))
#NOTE(review): 'whitegrid' is a seaborn style name; plt.grid presumably only reads it as a truthy "show grid" flag - confirm whether sns.set_style was intended
plt.grid('whitegrid')
sns.histplot(x='duration', data=duration_movie, bins=30, hue='ages', palette="viridis")
plt.title('Duration of Movies',fontweight="bold")
plt.show()
#DF of titles with at least one season (Season is 0 for titles measured in minutes)
duration_tv = nf[nf['Season'] != 0]
duration_tv.head(2)
| show_id | type | title | country | rating | duration | listed_in | year_added | year_release | Season | ages | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | s2 | TV Show | Blood & Water | South Africa | TV-MA | 0.0 | International TV Shows, TV Dramas, TV Mysteries | 2021.0 | 2021 | 2.0 | Adults |
| 2 | s3 | TV Show | Ganglands | United States | TV-MA | 0.0 | Crime TV Shows, International TV Shows, TV Act... | 2021.0 | 2021 | 1.0 | Adults |
#Graph showing the number of TV shows per season count, colored by ages group
plt.figure(figsize=(12,4))
#NOTE(review): 'whitegrid' is a seaborn style name; plt.grid presumably only reads it as a truthy "show grid" flag - confirm whether sns.set_style was intended
plt.grid('whitegrid')
sns.countplot(x='Season', data=duration_tv, hue='ages', palette="viridis")
plt.title('Duration of TV Shows',fontweight="bold")
#Anchor the legend to the right of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
#Import MultiLabelBinarizer from scikit-learn and create the shared instance used by relation_heatmap
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
def count(nf, content):
    """Report how many distinct genres appear in the 'listed_in' column.

    Side effect: stores the per-row list of genres in a new nf['genre']
    column (relation_heatmap reads that column).

    Parameters
    ----------
    nf : DataFrame with a 'listed_in' column of comma-separated genre names.
    content : label inserted into the returned sentence (e.g. 'Movie').

    Returns
    -------
    str
        "There are <n> types in the Netflix <content> Dataset".
    """
    def _split_genres(listing):
        #Normalise the separators to a bare comma before splitting
        return listing.replace(' ,', ',').replace(', ', ',').split(',')

    nf['genre'] = nf['listed_in'].apply(_split_genres)
    #Flatten the per-row lists and keep each genre once
    kinds = {genre for genres in nf['genre'] for genre in genres}
    return "There are {} types in the Netflix {} Dataset".format(len(kinds), content)
def relation_heatmap(nf, content):
    """Draw a heatmap of the pairwise correlation between genres.

    Parameters
    ----------
    nf : DataFrame with a 'genre' column holding a list of genres per row
        (count() creates that column).
    content : text used as the title prefix, e.g. 'Movie' -> "Movie Genre".

    Returns
    -------
    None. The figure is displayed with plt.show(). The module-level
    MultiLabelBinarizer ``mlb`` is re-fitted on nf['genre'] as a side effect.
    """
    #One indicator column per genre: 1 when the row lists it, otherwise 0
    encoded = mlb.fit_transform(nf['genre'])
    #Genre names in the column order of the encoded matrix
    labels = mlb.classes_
    #Create new df for correlation
    df = pd.DataFrame(encoded, columns=labels, index=nf['genre'].index)
    corr = df.corr()
    #Hide the diagonal and upper triangle, which only mirror the lower one.
    #Builtin bool replaces np.bool, an alias removed in NumPy 1.24.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True
    plt.subplots(figsize=(10, 7))
    sns.heatmap(corr, mask=mask, cmap="viridis", vmin=-.5, vmax=.5, square=True, linewidths=.7)
    plt.title(content + " Genre", fontweight='bold')
    plt.show()
count(nf_movie, 'Movie')
C:\Users\nhung\AppData\Local\Temp/ipykernel_16344/3090259903.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
'There are 20 types in the Netflix Movie Dataset'
relation_heatmap(nf_movie, 'Movie')
C:\Users\nhung\AppData\Local\Temp/ipykernel_16344/1211764482.py:13: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
count(nf_tv, 'TV Show')
C:\Users\nhung\AppData\Local\Temp/ipykernel_16344/3090259903.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
'There are 22 types in the Netflix TV Show Dataset'
relation_heatmap(nf_tv, 'TV Show')
C:\Users\nhung\AppData\Local\Temp/ipykernel_16344/1211764482.py:13: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here. Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
#Graph showing the 10 most frequent 'listed_in' values of each content type
fig, axes = plt.subplots(1, 2, figsize=(15, 5), sharey=True)
#One panel per content type: movies on the left, TV shows on the right
genre_panels = [
    (axes[0], nf_movie, "Top 10 Genre in Movies"),
    (axes[1], nf_tv, "Top 10 Genre in TV Shows"),
]
for panel_ax, frame, heading in genre_panels:
    #Count once and reuse the result for both the labels and the bar heights
    top_listings = frame["listed_in"].value_counts().head(10)
    sns.barplot(ax=panel_ax, x=top_listings.index, y=top_listings.values, palette="RdGy")
    panel_ax.set_title(heading, fontweight='bold')
#Rotate text in x-axis for better look
for ax in fig.axes:
    plt.sca(ax)
    plt.xticks(rotation=80)
plt.show()